source('misc_functions/functions.R')
library(tidyverse)
library(plotly)
library(modelr)
library(mgcv)

Using Standard Methods

set.seed(123)
library(mgcv)
dat = gamSim(1, n=400, dist="normal", scale=1, verbose=F)
mod = lm(y ~ x1 + x2, data=dat)
summary(mod)

Call:
lm(formula = y ~ x1 + x2, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.6634 -1.3220 -0.0571  1.5297  7.0937 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   7.2745     0.3332   21.83   <2e-16 ***
x1            6.3616     0.4352   14.62   <2e-16 ***
x2           -5.1214     0.4526  -11.31   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.532 on 397 degrees of freedom
Multiple R-squared:  0.4594,    Adjusted R-squared:  0.4567 
F-statistic: 168.7 on 2 and 397 DF,  p-value: < 2.2e-16
plot(mod, which = 1:2)

# requires broom and glue packages
broom::augment(mod) %>% 
  ggplot(aes(x=.fitted, y=y)) +
  geom_point(alpha=.25, color='#ff5500') + 
  geom_smooth(se=F, color='#00aaff') +
  annotate('text',
           label = glue::glue("Rsq = {round(summary(mod)$r.squared, 2)}"),
           x = 4,
           y = 16) +
  labs(title='Fitted vs. Observed') +
  theme_minimal()

dat %>% 
  select(x1, x2, y) %>% 
  gather(key=variable, value=Predictor, -y) %>% 
  ggplot(aes(x=Predictor, y=y)) +
  geom_point(alpha=.25, color='#ff5500') + 
  geom_smooth(aes(), color='#00aaff', se=F) +
  facet_grid(~variable) + 
  labs(title='Predictors vs. Y') +
  theme_trueMinimal()

Heteroscedasticity, non-normality, etc.

modlog = lm(log(y) ~ x1 + x2, dat)
summary(modlog)

Call:
lm(formula = log(y) ~ x1 + x2, data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-3.07226 -0.13090  0.05803  0.22780  0.79379 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.82673    0.05398  33.839   <2e-16 ***
x1           0.93691    0.07050  13.290   <2e-16 ***
x2          -0.68719    0.07333  -9.371   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4102 on 397 degrees of freedom
Multiple R-squared:  0.3968,    Adjusted R-squared:  0.3937 
F-statistic: 130.6 on 2 and 397 DF,  p-value: < 2.2e-16
plot(modlog, which = 1:2)


broom::augment(mod) %>% 
  ggplot(aes(x=.fitted, y=y)) +
  geom_point(alpha=.25, color='#ff5500') + 
  geom_smooth(se=F, color='#00aaff') +
  annotate('text',
           label = glue::glue("Rsq = {round(summary(modlog)$r.squared, 2)}"),
           x = 4,
           y = 16) +
  labs(title='Fitted vs. Observed') +
  theme_trueMinimal()

Polynomial Regression

set.seed(123)
x = runif(500)
mu = sin(2 * (4 * x - 2)) + 2 * exp(-(16 ^ 2) * ((x - .5) ^ 2))
y = rnorm(500, mu, .3)
d = data.frame(x,y) 

Polynomial regression is problematic

A standard linear regression is definitely not going to capture this relationship. As above, we could try and use polynomial regression here, e.g. fitting a quadratic or cubic function within the standard regression framework. However, this is unrealistic at best and at worst isn’t useful for complex relationships. In the following, even with a polynomial of degree 15 the fit is fairly poor in many areas, and ‘wiggles’ in some places where there doesn’t appear to be a need to.

library(plotly)  # install if you don't have

fits = sapply(seq(3,15, 3), function(p) fitted(lm(y~poly(x,p)))) %>% 
  data.frame(x, y, .) %>% 
  gather(key=polynomial, value=fits, -x, -y) %>% 
  mutate(polynomial = factor(polynomial, labels=seq(3,15, 3)))

plot_ly(data=d) %>% 
  add_markers(~x, ~y, marker=list(color='#ff5500', opacity=.2), showlegend=F) %>% 
  add_lines(~x, ~fits, color=~polynomial, data=fits) %>% 
  theme_plotly()

NA
LS0tCnRpdGxlOiAiVGhlIENhc2UgZm9yIEdBTXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawplZGl0b3Jfb3B0aW9uczogCiAgY2h1bmtfb3V0cHV0X3R5cGU6IGlubGluZQotLS0KCmBgYHtyIG1pc2NfZnVuY3Rpb25zfQpzb3VyY2UoJ21pc2NfZnVuY3Rpb25zL2Z1bmN0aW9ucy5SJykKYGBgCgpgYGB7ciBsb2FkX3BhY2thZ2VzfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShwbG90bHkpCmxpYnJhcnkobW9kZWxyKQpsaWJyYXJ5KG1nY3YpCmBgYAoKCiMjIFVzaW5nIFN0YW5kYXJkIE1ldGhvZHMKCmBgYHtyIHNpbXVsYXRlZF9kYXRhfQpzZXQuc2VlZCgxMjMpCmxpYnJhcnkobWdjdikKZGF0ID0gZ2FtU2ltKDEsIG49NDAwLCBkaXN0PSJub3JtYWwiLCBzY2FsZT0xLCB2ZXJib3NlPUYpCmBgYAoKYGBge3IgZGVtb2xtfQptb2QgPSBsbSh5IH4geDEgKyB4MiwgZGF0YT1kYXQpCnN1bW1hcnkobW9kKQpgYGAKYGBge3IgZGlhZ25vc3RpY3N9CnBsb3QobW9kLCB3aGljaCA9IDE6MikKYGBgCgpgYGB7ciBnZW5lcmFsX2ZpdH0KIyByZXF1aXJlcyBicm9vbSBhbmQgZ2x1ZSBwYWNrYWdlcwpicm9vbTo6YXVnbWVudChtb2QpICU+JSAKICBnZ3Bsb3QoYWVzKHg9LmZpdHRlZCwgeT15KSkgKwogIGdlb21fcG9pbnQoYWxwaGE9LjI1LCBjb2xvcj0nI2ZmNTUwMCcpICsgCiAgZ2VvbV9zbW9vdGgoc2U9RiwgY29sb3I9JyMwMGFhZmYnKSArCiAgYW5ub3RhdGUoJ3RleHQnLAogICAgICAgICAgIGxhYmVsID0gZ2x1ZTo6Z2x1ZSgiUnNxID0ge3JvdW5kKHN1bW1hcnkobW9kKSRyLnNxdWFyZWQsIDIpfSIpLAogICAgICAgICAgIHggPSA0LAogICAgICAgICAgIHkgPSAxNikgKwogIGxhYnModGl0bGU9J0ZpdHRlZCB2cy4gT2JzZXJ2ZWQnKSArCiAgdGhlbWVfbWluaW1hbCgpCmBgYAoKYGBge3IgcmVsYXRpb25zaGlwc30KZGF0ICU+JSAKICBzZWxlY3QoeDEsIHgyLCB5KSAlPiUgCiAgZ2F0aGVyKGtleT12YXJpYWJsZSwgdmFsdWU9UHJlZGljdG9yLCAteSkgJT4lIAogIGdncGxvdChhZXMoeD1QcmVkaWN0b3IsIHk9eSkpICsKICBnZW9tX3BvaW50KGFscGhhPS4yNSwgY29sb3I9JyNmZjU1MDAnKSArIAogIGdlb21fc21vb3RoKGFlcygpLCBjb2xvcj0nIzAwYWFmZicsIHNlPUYpICsKICBmYWNldF9ncmlkKH52YXJpYWJsZSkgKyAKICBsYWJzKHRpdGxlPSdQcmVkaWN0b3JzIHZzLiBZJykgKwogIHRoZW1lX3RydWVNaW5pbWFsKCkKYGBgCgojIyBIZXRlcm9zY2VkYXN0aWNpdHksIG5vbi1ub3JtYWxpdHksIGV0Yy4KCmBgYHtyIGxvZ19ub19oZWxwfQptb2Rsb2cgPSBsbShsb2coeSkgfiB4MSArIHgyLCBkYXQpCnN1bW1hcnkobW9kbG9nKQpwbG90KG1vZGxvZywgd2hpY2ggPSAxOjIpCgpicm9vbTo6YXVnbWVudChtb2QpICU+JSAKICBnZ3Bsb3QoYWVzKHg9LmZpdHRlZCwgeT15KSkgKwogIGdlb21fcG9pbnQoYWxwaGE9LjI1LCBjb2xvcj0nI2ZmNTUwMCcpICsgCiAgZ2VvbV9zbW9vdGgoc2U9RiwgY29sb3I9JyMwMGFhZmYnKSArCiAgYW5ub3RhdGUoJ3RleHQnLAogICAgICAgICAgIGxhYmVsID0gZ2x1ZTo6Z2x1ZSgiUnNxID0ge3JvdW5kKHN1bW1hcnkobW9kbG9nKSRyLnNxdWFyZWQsIDIpfSIpLAogICAgICAgICAgIHggPSA0LAogICAgICAgICAgIHkgPSAxNikgKwogIGxhYnModGl0bGU9J0ZpdHRlZCB2cy4gT2JzZXJ2ZWQnKSArCiAgdGhlbWVfdHJ1ZU1pbmltYWwoKQpgYGAKCgojIyBQb2x5bm9taWFsIFJlZ3Jlc3Npb24KCmBgYHtyIHNpbURhdGF9CnNldC5zZWVkKDEyMykKeCA9IHJ1bmlmKDUwMCkKbXUgPSBzaW4oMiAqICg0ICogeCAtIDIpKSArIDIgKiBleHAoLSgxNiBeIDIpICogKCh4IC0gLjUpIF4gMikpCnkgPSBybm9ybSg1MDAsIG11LCAuMykKZCA9IGRhdGEuZnJhbWUoeCx5KSAKYGBgCgoKIyMjIyBQb2x5bm9taWFsIHJlZ3Jlc3Npb24gaXMgcHJvYmxlbWF0aWMKCkEgc3RhbmRhcmQgbGluZWFyIHJlZ3Jlc3Npb24gaXMgZGVmaW5pdGVseSBub3QgZ29pbmcgdG8gY2FwdHVyZSB0aGlzIHJlbGF0aW9uc2hpcC4gIEFzIGFib3ZlLCB3ZSBjb3VsZCB0cnkgYW5kIHVzZSBwb2x5bm9taWFsIHJlZ3Jlc3Npb24gaGVyZSwgZS5nLiBmaXR0aW5nIGEgcXVhZHJhdGljIG9yIGN1YmljIGZ1bmN0aW9uIHdpdGhpbiB0aGUgc3RhbmRhcmQgcmVncmVzc2lvbiBmcmFtZXdvcmsuICBIb3dldmVyLCB0aGlzIGlzIHVucmVhbGlzdGljIGF0IGJlc3QgYW5kIGF0IHdvcnN0IGlzbid0IHVzZWZ1bCBmb3IgY29tcGxleCByZWxhdGlvbnNoaXBzLiBJbiB0aGUgZm9sbG93aW5nLCBldmVuIHdpdGggYSBwb2x5bm9taWFsIG9mIGRlZ3JlZSAxNSB0aGUgZml0IGlzIGZhaXJseSBwb29yIGluIG1hbnkgYXJlYXMsIGFuZCAnd2lnZ2xlcycgaW4gc29tZSBwbGFjZXMgd2hlcmUgdGhlcmUgZG9lc24ndCBhcHBlYXIgdG8gYmUgYSBuZWVkIHRvLgoKYGBge3Igc2ltRGF0YVBsb3QsIG1lc3NhZ2U9Rn0KZml0cyA9IHNhcHBseShzZXEoMywxNSwgMyksIGZ1bmN0aW9uKHApIGZpdHRlZChsbSh5fnBvbHkoeCxwKSkpKSAlPiUgCiAgZGF0YS5mcmFtZSh4LCB5LCAuKSAlPiUgCiAgZ2F0aGVyKGtleT1wb2x5bm9taWFsLCB2YWx1ZT1maXRzLCAteCwgLXkpICU+JSAKICBtdXRhdGUocG9seW5vbWlhbCA9IGZhY3Rvcihwb2x5bm9taWFsLCBsYWJlbHM9c2VxKDMsMTUsIDMpKSkKCnBsb3RfbHkoZGF0YT1kKSAlPiUgCiAgYWRkX21hcmtlcnMofngsIH55LCBtYXJrZXI9bGlzdChjb2xvcj0nI2ZmNTUwMCcsIG9wYWNpdHk9LjIpLCBzaG93bGVnZW5kPUYpICU+JSAKICBhZGRfbGluZXMofngsIH5maXRzLCBjb2xvcj1+cG9seW5vbWlhbCwgZGF0YT1maXRzKSAlPiUgCiAgdGhlbWVfcGxvdGx5KCkKCmBgYAoKCgo=